#delimit ;

/* this program sets up the MC loop for Table 2

It has the following inputs in the the program:

mcreps 		how many monte carlo reps
			something about models to estimate and save
			maybe something about files to use and save to?
*/


cap prog drop do_the_monte_carlo ;
prog def do_the_monte_carlo ;
syntax  , [mcreps(integer 3) bsreps(integer 4) numstates(integer 6) statelist(string) sourcedata(string) savedata(string)] ;

di "Table 4 loop.  MC Reps = `mcreps', BS Reps = `bsreps', Number of states sampled = `numstates'" ;


/* get ready to run the monte carlo */
tempfile dummies to_be_sampled tobsample main_data bsout pairs_bs_out ;
local mydate = subinstr("$S_DATE"," ","_",.) ;


local postlist = "b se_def se_clu se_clu_bs bs_misreps_b bs_misreps_t p_pairs_bs p_rad_res p_webb_res " ;
cap postclose mcoutput ;
qui postfile mcoutput numobs numstates K numbsreps `postlist' using "t4_post_G`numstates'_`mydate'.dta"  , replace ;


local lhs = "lnwage_sy" ;
local rhs = "policy" ;
local key_rhs = "policy" ;
local keyrhs = "`key_rhs'" ;
local reglhs = "`lhs'_deviation" ;
local regrhs = "`rhs'_deviation" ;

local beta_hypothesis = 0 ;

local command = "reg `reglhs' `regrhs' , noconstant cluster(pseudo_cluster) " ;


/* the main loop */
qui forvalues mc = 1/`mcreps' { ;

/* because we later set the seed for a bootstrap, we need to make sure to re-set the seed to a new
value for each monte carlo iteration.  Otherwise almost all runs will draw the same samples */
local myseed = 10101 + `mc' ;
set seed `myseed' ;

*noi di "In MC loop.  mc=`mc'.  $S_DATE $S_TIME" ;
noi di "." _continue ;
if (mod(`mc',100) == 1) { ; /* every X reps, report progress */
	noi di " `mc' " _continue ;
} ;

/* make sure we don't post last round's data to this round */
foreach zz in `postlist' { ;
	local `zz' = "." ;
	} ;

/* generate the data */


use `statelist' , replace ;
bsample `numstates' ;
/* assign treatment dummy to 1/2 of states (round down) */
gen sortvar = uniform() ;
sort sortvar ;
gen policy_st = _n / _N <= 0.5 ;
/* generate pseudo-cluster-ids for later */
sort statefip ;
qui by statefip: gen subid = _n ;
egen pseudo_cluster = group(statefip subid) ;
keep statefip subid pseudo_cluster policy_st ;
sort statefip subid ;
save `to_be_sampled' , replace ;

keep statefip ;
contract statefip ;
sort statefip ;
merge 1:m statefip using "`sourcedata'" , assert(match using) keep(match) nogenerate  ;

expand _freq ;  /* this was generated by the contract command; tells us how many replicates of each state to make */
sort state year ;
qui by state year: gen subid = _n ;
sort statefip subid ;
merge m:1 statefip subid using `to_be_sampled' , assert(match) keep(match) nogenerate ;

/* create the policy variable */
qui summ year ;
local yrmean = r(mean) ;
gen policy_yr = year >= `yrmean' ;
gen policy = policy_st * policy_yr ;
drop policy_st policy_yr subid _freq ;

/* make deviations from state and year Fixed Effects */
egen `lhs'_avg_s = mean(`lhs') , by(pseudo_cluster) ;    /* this should be moved outside the MC loop, for computational improvement.  maybe do that later */
egen `lhs'_avg_y = mean(`lhs') , by(year) ;
egen `keyrhs'_avg_s = mean(`keyrhs') , by(pseudo_cluster) ;    
egen `keyrhs'_avg_y = mean(`keyrhs') , by(year) ;

make_deviations `lhs' `rhs'  ;

/* estimate the models */

/* row 1, default standard error */
reg `reglhs' `regrhs' , noconstant ;
local b = _b[`regrhs'] ;
local se_def = _se[`regrhs'] ;
local K = e(rank) ;
local numobs = e(N) ;


/* generate "restricted, impose the null hypothesis" residuals */

replace `lhs' = `lhs' - `beta_hypothesis' * `rhs' ;
/* use make_deviations command to get deviations from state and year FEs.  These are the "restricted residuals".
	(this is because there are no other regressors in the model; just "policy" and the FEs.)
	the "restricted y-hats are the `lhs' above, minus the restricted residuals */
make_deviations `lhs' ;
cap drop resid_restricted ;
gen resid_restricted = `lhs'_deviation ;
gen yhat_restricted = `lhs' - resid_restricted ;
replace `lhs' = `lhs' + `beta_hypothesis' * `rhs' ;


/* get standard errors for rows 2-4 */
reg `reglhs' `regrhs' , noconstant cluster(pseudo_cluster) ;
local se_clu = _se[`keyrhs'] ;


sort pseudo_cluster year ;
qui save `main_data' , replace ;


/* do nonparametric bootstraps, for rows 5 and 6 */
tempfile pairs_bs_out ;
cap erase `pairs_bs_out' ;


/* calls program "table4_np_boot", which is defined in t34_programs.do.  That program re-does the "deviations from means" and then 
re-runs the regression, to get the betas and (clustered) standard errors. */
qui bootstrap b=r(mybeta) se=r(myse) , reps(`bsreps') nodots cluster(pseudo_cluster) idcluster(pseudo_cluster_2) 
	seed(10101) saving(`pairs_bs_out' , double)
	: table4_np_boot , lhs(`lhs') rhs(`rhs') keyrhs(`keyrhs') clusterid(pseudo_cluster_2) ;
local bs_misreps_b = e(N_misreps) ;

use `pairs_bs_out' , replace ;
qui summ b ;
local se_clu_bs = r(sd) ;

keep b se ;
gen t_pairs_bs = ((b - `b') / se) ;
*noi summ ;
keep t_pairs_bs ;
save `pairs_bs_out' , replace ;


/* Do percentile-T bootstraps, for rows 7 and 8  */
local main_t = (`b' - `beta_hypothesis') / `se_clu' ;

/* Set up postfiles for percentile-T bootstraps */
cap postclose bs_output ;
cap erase `bsout' ;
postfile bs_output t_rad_res t_webb_res using `bsout' ;

use `main_data' , replace ;
keep pseudo_cluster ;
contract pseudo_cluster ;
tempfile tobsample ;
qui save `tobsample' , replace ;



qui forvalues bb = 1/`bsreps' { ;

	/* for the wild bootstrap */
	/* take the cluster list, generate 2 sets of residual transformations */
	/* then merge these back onto main dataset, created transformed residuals and then transformed y-hats */
	/* then estimate the models, and save the t-statistics */



	use pseudo_cluster using `to_be_sampled' , replace ;
	gen my_uniform = uniform() ;
	gen wild_rademacher = -1 + 2 * (my_uniform >= 0.5) ;
	gen wild_webb = 	(-1) * sqrt(1.5) * (my_uniform > (0) & my_uniform <= (1/6)) +  
						(-1) * sqrt(1) * (my_uniform > (1/6) & my_uniform <= (2/6))  + 
						(-1) * sqrt(0.5) * (my_uniform > (2/6) & my_uniform <= (3/6)) + 
						(+1) * sqrt(0.5) * (my_uniform > (3/6) & my_uniform <= (4/6)) + 
						(+1) * sqrt(1) * (my_uniform > (4/6) & my_uniform <= (5/6))  + 
						(+1) * sqrt(1.5) * (my_uniform > (5/6) & my_uniform <= (6/6)) ; 
	
	keep pseudo_cluster wild_rademacher wild_webb ;
	sort pseudo_cluster ;
	merge 1:m pseudo_cluster using `main_data' , assert(match) keep(match) nogenerate ;	
	
	/* create transformed residuals and new wild-outcome-variables */
	gen resid_wild_rad_restricted = resid_restricted * wild_rademacher ;
	gen resid_wild_webb_restricted = resid_restricted * wild_webb ;
	
	gen y_wild_rad_restricted = yhat_restricted + resid_wild_rad_restricted ;
	gen y_wild_webb_restricted = yhat_restricted + resid_wild_webb_restricted ;

	/* now estimate cluster-robust models on each of these, generating t-statistics.
		For the restricted model, the t-stat is based on the null hypothesis.  for the unrestricted
		model the t-stat is based on the main (first) estiamted beta */

	/* need to compute "deviations from means" for the new LHS variables ... */
	/* to do this, make year and state specific averages, then call "make_deviations" code */
	cap drop y_wild_rad_restricted_avg_s ;
	cap drop y_wild_rad_restricted_avg_y ;
	cap drop y_wild_webb_restricted_avg_s ;
	cap drop y_wild_webb_restricted_avg_y ;

	egen y_wild_rad_restricted_avg_s = mean(y_wild_rad_restricted) , by(pseudo_cluster) ;    /* this should be moved outside the MC loop, for computational improvement.  maybe do that later */
	egen y_wild_rad_restricted_avg_y = mean(y_wild_rad_restricted) , by(year) ;
	egen y_wild_webb_restricted_avg_s = mean(y_wild_webb_restricted) , by(pseudo_cluster) ;    /* this should be moved outside the MC loop, for computational improvement.  maybe do that later */
	egen y_wild_webb_restricted_avg_y = mean(y_wild_webb_restricted) , by(year) ;
	
	make_deviations y_wild_rad_restricted y_wild_webb_restricted ;


	local shortcommand = subinstr("`command'","`reglhs'","y_wild_rad_restricted_deviation",.) ;
	`shortcommand' ;
	local b_wild_rademacher_restricted = _b[`key_rhs'] ;
	local se_wild_rademacher_restricted = _se[`key_rhs'] ;

	local shortcommand = subinstr("`command'","`reglhs'","y_wild_webb_restricted_deviation",.) ;
	`shortcommand' ;
	local b_wild_webb_restricted = _b[`key_rhs'] ;
	local se_wild_webb_restricted = _se[`key_rhs'] ;

	/* make the t-stats ; store away into a postfile */

	local t_wild_rademacher_restricted  = (`b_wild_rademacher_restricted ' - `beta_hypothesis') 
		/ `se_wild_rademacher_restricted' ;
	local t_wild_webb_restricted  = (`b_wild_webb_restricted ' - `beta_hypothesis') 
		/ `se_wild_webb_restricted' ;

	post bs_output (`t_wild_rademacher_restricted') (`t_wild_webb_restricted') ;
	
	

	
} ;

postclose bs_output ;



/* identify percentiles in the bootstrap distribution, to get p-values and/or rejection rates */

/* to get these percentiles, we will make lists of t-stats, with the "main" t-stat stuck into this list
we wante to know what the p-value of this main t-stat is.  Because with wild, few clusters, we can get intervals,
we will use mean-p-value [is this what we want?] of the interval.   */

drop _all ;
use `bsout' ;
merge 1:1 _n using `pairs_bs_out' , nogenerate ;
summ ;
save `bsout' , replace ;

local main_t = (`b' - `beta_hypothesis') / `se_clu' ;
drop _all ;
set obs 1 ;
gen t_rad_res = `main_t' ;
gen t_webb_res = `main_t' ;
gen t_pairs_bs = `main_t' ;

append using `bsout' ;
gen rank = . ;

gen num_miss = t_pairs_bs == . ;
qui summ num_miss ;
local bs_misreps_t = r(sum) ;


foreach var in pairs_bs rad_res webb_res { ;

	sort t_`var' ;
	replace rank = _n ;
	summ t_`var' ;
	local maxrank = r(N) ;    /* should be #bs reps; but this allows for missing bs values */
	summ rank if abs(t_`var' - `main_t') < 0.0001 ; /* allow for machine error in t-stat computations in bootstrap step */
	local meanrank = r(mean) ;
	local pctile = `meanrank' / `maxrank' ;
	local myp = 2 * min(`pctile' , (1-`pctile')) ;
	local p_`var' = `myp' ;
	
} ;



/* save the results */

post mcoutput (`numobs') (`numstates') (`K') (`bsreps') (`b') (`se_def') (`se_clu') (`se_clu_bs') (`bs_misreps_b') (`bs_misreps_t')
	(`p_pairs_bs') (`p_rad_res') (`p_webb_res') ;


} ;


/* now, take the whole package of results and save them */

postclose mcoutput ;
use "t4_post_G`numstates'_`mydate'.dta" , replace ;
qui save `savedata' , replace ;

di ; di ;

end ;

